{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scikit-learn from CSV\n",
    "\n",
    "This notebook reads the CSV data written out by the Dataflow program of [1_explore.ipynb](./1_explore.ipynb) and trains a scikit-learn model on Cloud ML Engine.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting cloudml-hypertune\n",
      "Installing collected packages: cloudml-hypertune\n",
      "Successfully installed cloudml-hypertune-0.1.0.dev5\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install cloudml-hypertune"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "BUCKET = 'cloud-training-demos-ml'\n",
    "PROJECT = 'cloud-training-demos'\n",
    "REGION = 'us-central1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Publish the notebook settings as environment variables so the %%bash cells can read them.\n",
    "os.environ.update(BUCKET=BUCKET, PROJECT=PROJECT, REGION=REGION)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Updated property [core/project].\n",
      "Updated property [compute/region].\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Make the notebook's project and region the gcloud defaults.\n",
    "gcloud config set core/project ${PROJECT}\n",
    "gcloud config set compute/region ${REGION}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "# Enable mode 1 so that modules registered with %aimport are actually reloaded before each cell runs.\n",
    "%autoreload 1\n",
    "%aimport ltgpred"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train sklearn model locally"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mkdir: cannot create directory ‘preproc/csv’: File exists\n",
      "Copying gs://cloud-training-demos-ml/lightning/preproc_0.02_32_2/csv/eval-00000-of-00253...\n",
      "Copying gs://cloud-training-demos-ml/lightning/preproc_0.02_32_2/csv/train-00000-of-00522...\n",
      "- [2 files][ 14.2 MiB/ 14.2 MiB]                                                \n",
      "Operation completed over 2 objects/14.2 MiB.                                     \n"
     ]
    }
   ],
   "source": [
    "!mkdir -p preproc/csv\n",
    "!gcloud storage cp gs://$BUCKET/lightning/preproc_0.02_32_2/csv/*-00000-* preproc/csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cx</th>\n",
       "      <th>cy</th>\n",
       "      <th>lat</th>\n",
       "      <th>lon</th>\n",
       "      <th>mean_ref_sm</th>\n",
       "      <th>max_ref_sm</th>\n",
       "      <th>mean_ref_big</th>\n",
       "      <th>max_ref_big</th>\n",
       "      <th>ltg_sm</th>\n",
       "      <th>ltg_big</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1186</td>\n",
       "      <td>770</td>\n",
       "      <td>47.72</td>\n",
       "      <td>-109.60</td>\n",
       "      <td>0.397961</td>\n",
       "      <td>0.454567</td>\n",
       "      <td>0.489967</td>\n",
       "      <td>0.885870</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.017041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>586</td>\n",
       "      <td>898</td>\n",
       "      <td>35.72</td>\n",
       "      <td>-107.04</td>\n",
       "      <td>0.391342</td>\n",
       "      <td>0.500481</td>\n",
       "      <td>0.407276</td>\n",
       "      <td>0.885218</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.044970</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>626</td>\n",
       "      <td>1010</td>\n",
       "      <td>36.52</td>\n",
       "      <td>-104.80</td>\n",
       "      <td>0.574724</td>\n",
       "      <td>0.818150</td>\n",
       "      <td>0.625667</td>\n",
       "      <td>0.931585</td>\n",
       "      <td>0.92</td>\n",
       "      <td>0.288521</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>42</td>\n",
       "      <td>1090</td>\n",
       "      <td>24.84</td>\n",
       "      <td>-103.20</td>\n",
       "      <td>0.688098</td>\n",
       "      <td>0.831441</td>\n",
       "      <td>0.527593</td>\n",
       "      <td>0.894942</td>\n",
       "      <td>0.16</td>\n",
       "      <td>0.242840</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>318</td>\n",
       "      <td>1298</td>\n",
       "      <td>30.36</td>\n",
       "      <td>-99.04</td>\n",
       "      <td>0.481775</td>\n",
       "      <td>0.854034</td>\n",
       "      <td>0.469902</td>\n",
       "      <td>0.869434</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.084734</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     cx    cy    lat     lon  mean_ref_sm  max_ref_sm  mean_ref_big  \\\n",
       "0  1186   770  47.72 -109.60     0.397961    0.454567      0.489967   \n",
       "1   586   898  35.72 -107.04     0.391342    0.500481      0.407276   \n",
       "2   626  1010  36.52 -104.80     0.574724    0.818150      0.625667   \n",
       "3    42  1090  24.84 -103.20     0.688098    0.831441      0.527593   \n",
       "4   318  1298  30.36  -99.04     0.481775    0.854034      0.469902   \n",
       "\n",
       "   max_ref_big  ltg_sm   ltg_big  \n",
       "0     0.885870    0.00  0.017041  \n",
       "1     0.885218    0.00  0.044970  \n",
       "2     0.931585    0.92  0.288521  \n",
       "3     0.894942    0.16  0.242840  \n",
       "4     0.869434    0.00  0.084734  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_csv('preproc/csv/train-00000-of-00522', \n",
    "        header=None,\n",
    "        names=[\n",
    "            'cx', 'cy', 'lat', 'lon', 'mean_ref_sm', 'max_ref_sm',\n",
    "            'mean_ref_big', 'max_ref_big', 'ltg_sm', 'ltg_big', 'has_ltg'\n",
    "        ])\n",
    "del df['has_ltg']\n",
    "#df = pd.get_dummies(df)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "export CLOUDSDK_PYTHON=$(which python3)\n",
    "# Absolute path, so the directory cleared here is the one the trainer writes to\n",
    "# regardless of the working directory the local training process runs in.\n",
    "OUTDIR=${PWD}/skl_trained\n",
    "DATADIR=${PWD}/preproc/csv\n",
    "rm -rf $OUTDIR\n",
    "# Same module name and package path as the CMLE job submission later in this notebook.\n",
    "gcloud ml-engine local train \\\n",
    "    --module-name=ltgpred.trainer.train_skl --package-path=${PWD}/ltgpred \\\n",
    "    -- \\\n",
    "    --job-dir=$OUTDIR --train_data=${DATADIR}/train* --eval_data=${DATADIR}/eval*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training sklearn model on CMLE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting largemachine.yaml\n"
     ]
    }
   ],
   "source": [
    "%%writefile largemachine.yaml\n",
    "trainingInput:\n",
    "  scaleTier: CUSTOM\n",
    "  masterType: complex_model_l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "jobId: ltgpred_skl_190423_034420\n",
      "state: QUEUED\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR: Python 3 and later is not compatible with the Google Cloud SDK. Please use Python version 2.7.x.\n",
      "\n",
      "If you have a compatible Python interpreter installed, you can use it by setting\n",
      "the CLOUDSDK_PYTHON environment variable to point to it.\n",
      "\n",
      "Job [ltgpred_skl_190423_034420] submitted successfully.\n",
      "Your job is still active. You may view the status of your job with the command\n",
      "\n",
      "  $ gcloud ml-engine jobs describe ltgpred_skl_190423_034420\n",
      "\n",
      "or continue streaming the logs with the command\n",
      "\n",
      "  $ gcloud ml-engine jobs stream-logs ltgpred_skl_190423_034420\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "export CLOUDSDK_PYTHON=$(which python3)\n",
    "\n",
    "# Training inputs and outputs both live under the lightning/ prefix of the bucket.\n",
    "DATADIR=gs://${BUCKET}/lightning/preproc_0.02_32_2/csv\n",
    "OUTDIR=gs://${BUCKET}/lightning/skl_trained\n",
    "# The job name carries a UTC timestamp suffix.\n",
    "JOBNAME=ltgpred_skl_$(date -u +%y%m%d_%H%M%S)\n",
    "\n",
    "# Remove the output of any earlier run before submitting.\n",
    "gcloud storage rm --recursive --continue-on-error ${OUTDIR}\n",
    "\n",
    "# Everything after the bare -- is passed through to the trainer module.\n",
    "gcloud ml-engine jobs submit training ${JOBNAME} \\\n",
    "    --region=${REGION} \\\n",
    "    --scale-tier=custom \\\n",
    "    --config=largemachine.yaml \\\n",
    "    --runtime-version=1.9 \\\n",
    "    --python-version=3.5 \\\n",
    "    --package-path=${PWD}/ltgpred \\\n",
    "    --module-name=ltgpred.trainer.train_skl \\\n",
    "    --job-dir=${OUTDIR} \\\n",
    "    -- \\\n",
    "    --train_data=${DATADIR}/train-001* --eval_data=${DATADIR}/eval-0000*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When I ran it, training finished with an RMSE of 0.43.\n",
    "This can serve as a benchmark.\n",
    "\n",
    "Note, however, that I trained and evaluated on a subset of the data, since even the \"largemachine\" doesn't have enough memory to hold the entire dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copyright 2018 Google Inc. Licensed under the Apache License, Version 2.0 (the \\\"License\\\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \\\"AS IS\\\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
